Background

Home location is the most important factor in home value. Identical homes can have dramatically different values due to the location. One of the factors that home buyers concern is the school district.

Hypothesis

The average prices of nearby houses are higher when the high schools get higher ranking.

if (require(dplyr) == FALSE) {install.packages('dplyr')}
if (require(ggmap) == FALSE) {install.packages('ggmap')}
if (require(XML) == FALSE) {install.packages('XML')}
if (require(gridExtra) == FALSE) {install.packages('gridExtra')}
if (require(psych) == FALSE) {install.packages('psych')}
library(dplyr)
library(ggmap)
library(ggplot2)
library(XML)
library(gridExtra)
library(psych)
if (require(devtools) == FALSE) {install.packages('devtools')}
library(devtools)
if (require(rvest) == FALSE) {devtools::install_github("hadley/rvest")}
library(rvest) 
library(stringr)
library(reshape2)

Data acquisition

1) The rank and address of high school were web-scraped from the U.S. News rankings using rvest package.

Website: http://www.usnews.com/education/best-high-schools/national-rankings

The U.S. News rankings include data on more than 19,400 public high schools in 50 states and the District of Columbia. However, only around 2100 public schools have ranking.

# Extract school rank, school name and address from usnews.com

web <- 'http://www.usnews.com/education/best-high-schools/national-rankings/spp+100/page+'
school <- data.frame(rankingscore = numeric(0), schoolname = character(0), 
                     street = character(0), csz = character(0))
for (i in 1:21){
    url <- paste0(web, i)
    website <- html(url)
    rankingscore <- website %>% html_nodes("span.rankings-score span") %>% html_text()
    if (length(rankingscore)< 100){
        rankingscore <- append(rankingscore, rep(NA, 100-length(rankingscore)))
    }
    schoolname <- website %>% html_nodes("div.school-name a") %>% html_text()
    street <- website %>% html_nodes("div.school-street") %>% html_text()
    csz <- website %>% html_nodes("div.school-csz") %>% html_text()
    all <- data.frame(cbind(rankingscore, schoolname, street, csz), 
                      stringsAsFactors = FALSE)
    school <- rbind(school, all)
}

# Convert ranking score into numeric format

school$rankingscore <- as.numeric(str_replace_all(school$rankingscore, "[#,]", ""))
school$schoolname <- str_replace_all(school$schoolname, "(<U\\+200B>)", "")

# split csz into city, state and zipcode

csz <- colsplit(school$csz, ',', c('city', 'sz')) 
csz1 <- data.frame(str_split_fixed(str_trim(csz$sz), ' ', n = 2), 
                   stringsAsFactors = FALSE)
names(csz1) <- c('state', 'zipcode')
split_csz<- setNames(data.frame(cbind(csz$city, csz1$state, csz1$zipcode),
                                stringsAsFactors = FALSE), c('city', 'state', 'zipcode'))

# Combine into final school dataframe
schoolranking <- cbind(school[1:3], split_csz)

2) Zillow Home Value Index (zillowIndex) was used to present the average home price around the ranked high schools.

The Zillow Home Value Index is the median Zestimate home valuation (Zillow’s estimated market value). ZillowIndex was queried through Zillow GetDemographics API. Website: http://www.zillow.com/howto/api/GetDemographics.htm

Due to the limitation of 1000 queries per day by zillow API, the dataset was devided into 3 parts (900, 900 and 300 records respectively). The query was run once per day for each part. The completed dataset was combined using rbind() function.

# Create function that takes zipcode and query the zillow home value index from zillowAPI

getZillow <- function(zip){
    url <- paste0("http://www.zillow.com/webservice/GetDemographics.htm?zws-id=X1-ZWz1dx93rz7eob_9fxlx&zip=", zip)
    doc <- xmlParse(url)
    e <- xpathSApply(doc, path = "//response/pages//attribute/name[contains(text(), 'Zillow Home Value Index')]/..//zip/value", xmlValue)
    return (e)
}

# Divide schoolranking into three parts, retrieve zillowindex from zillow API one part a day. 
part1 <- schoolranking[1:900, ]
part2 <- schoolranking[901:1800, ]
part3 <- schoolranking[1801:2100, ]

zillowindex1 <- lapply(part1$zipcode, getZillow)
zillowindex1[sapply(zillowindex1, is.null)] <- NA
zillowindex1 <- unlist(zillowindex1)
part1final <- cbind(part1, zillowindex1)

zillowindex2 <- lapply(part2$zipcode, getZillow)
zillowindex2[sapply(zillowindex2, is.null)] <- NA
zillowindex2 <- unlist(zillowindex2)
part2final <- cbind(part2, zillowindex2)

zillowindex3 <- lapply(part3$zipcode, getZillow)
zillowindex3[sapply(zillowindex3, is.null)] <- NA
zillowindex3 <- unlist(zillowindex3)
part3final <- cbind(part3, zillowindex3)

row.names(part1final)<-NULL
row.names(part2final)<-NULL
row.names(part3final)<-NULL

names(part1final) <- c('rank', 'school', 'street', 'city', 'state', 
                       'zipcode', 'zillowindex')
names(part2final) <- c('rank', 'school', 'street', 'city', 'state', 
                       'zipcode', 'zillowindex')
names(part3final) <- c('rank', 'school', 'street', 'city', 'state', 
                       'zipcode', 'zillowindex')

rank <- rbind(part1final, part2final, part3final)

save(rank, file = "C:/temp/rank.rda")

3) The latitude and longitude coordinates of the zipcodes were obtained by ggmap package.

# Obtain lat and lon by geocode() function in ggmap package
lonlat <- geocode(rank$zipcode)
schoolrank <- cbind(rank, lonlat)
save(schoolrank, file = "C:/temp/schoolrank.rda")

The final dataset was saved as .rda file and uploaded to the public folder in dropbox.

Data transformation and cleanup

The .rda file was downloaded from dropbox directly.

# load("C:/temp/schoolrank.rda") 

setInternet2(use = TRUE)

download.file("https://dl.dropboxusercontent.com/u/8963938/schoolrank.rda", 
              destfile="./schoolrank.rda")

load("./schoolrank.rda")

1) Convert zillowIndex from character into numeric

schoolrank$zillowindex <- as.character(schoolrank$zillowindex)
schoolrank$zillowindex <- as.numeric(schoolrank$zillowindex)

2) Convert rank column into integer

schoolrank$rank <- as.integer(schoolrank$rank)

3) Remove the records in ‘rank’ column that contains NA (without rank)

school <- schoolrank %>% filter (!is.na(rank))
# From 2100 to 2019 

4) Remove the records in ‘zillowindex’ column that contains NA (no local zillowIndex on that zipcode)

school <- school %>% filter (!is.na(zillowindex))
# From 2019 to 1472

5) Remove the state that has less than 3 schools (TX and DE)

state.lt3 <- as.list(school %>% group_by(state) %>% 
                         summarise(count = n()) %>% filter(count < 3) %>% 
                         select(state))
names(state.lt3)<-NULL
state.lt3<-unlist(state.lt3)

school <- school[!school$state %in% state.lt3,] 
# From 1472 to 1469

Descriptive statistics

1) Summary of dataset

summary(school)
##       rank           school             street              city          
##  Min.   :   2.0   Length:1469        Length:1469        Length:1469       
##  1st Qu.: 472.0   Class :character   Class :character   Class :character  
##  Median : 944.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 966.1                                                           
##  3rd Qu.:1439.0                                                           
##  Max.   :2019.0                                                           
##     state             zipcode           zillowindex           lon         
##  Length:1469        Length:1469        Min.   :  35700   Min.   :-158.16  
##  Class :character   Class :character   1st Qu.: 183200   1st Qu.:-117.86  
##  Mode  :character   Mode  :character   Median : 294200   Median : -86.23  
##                                        Mean   : 388454   Mean   : -95.03  
##                                        3rd Qu.: 495100   3rd Qu.: -77.80  
##                                        Max.   :3068900   Max.   : -70.30  
##       lat       
##  Min.   :20.85  
##  1st Qu.:34.18  
##  Median :39.17  
##  Mean   :38.23  
##  3rd Qu.:41.70  
##  Max.   :48.76

2) Total states included in the data: 34

length(unique(school$state))
## [1] 34

3) Rank: from 2 to 2019

summary(school$rank)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     2.0   472.0   944.0   966.1  1439.0  2019.0

4) zillowIndex: range from 35,700 to 3,068,900, right-skewed

describe(school$zillowindex)
##   vars    n     mean       sd median  trimmed    mad   min     max   range
## 1    1 1469 388453.5 315638.3 294200 336042.1 196148 35700 3068900 3033200
##   skew kurtosis      se
## 1 2.72     11.9 8235.29
ggplot(school, aes(x=zillowindex)) + 
    geom_histogram(aes(y=..density..), binwidth=80000, color='black', fill='white') +
    geom_density(alpha=.2, fill="#003399") +
    ggtitle('ZillowIndex histogram overlaid with density curve') +
    theme_bw()

# Ref: http://www.cookbook-r.com/Graphs/Plotting_distributions_%28ggplot2%29/

Data analysis

1) Geographical distribution of rank and zillowIndex

Expensive houses are located at west coast and northeast coast.

map <- get_map(location = 'United states', zoom = 4, 
               maptype = 'roadmap', source = 'google', color = 'color')
ggmap(map) + geom_point(aes(x=lon, y=lat, colour=zillowindex), 
                        data=school,  alpha = 0.6, size=3, na.rm = T)  + 
  scale_color_gradient(low="#3333FF", high="#FF3300") +
    ggtitle('Distribution: Zillow House Value Index')

Schools with different ranks are more evenly distributed.

ggmap(map) + geom_point(aes(x=lon, y=lat, colour=rank), data=school, 
                        alpha = 0.8, size=3, na.rm = T)  + 
    scale_color_gradient(low="#FF3300", high="#3333FF") +
    ggtitle('Distribution: School rank')

2) Relationship between school rank and zillowIndex

ggplot(school, aes(x=rank, y=zillowindex/1000)) + 
    geom_point(color= "#0066CC", alpha = 0.6) + coord_trans(y = "log10") + 
    stat_smooth(color="#990066", size = 1, method = "lm") +
    xlab('school rank') +
    ylab("zillowIndex in thousand (log10 scale)") +
    ggtitle('zillowIndex ~ school rank') +
    theme_bw()

Correlation: weak downhill (negative) linear relationship (-0.3305684)

cor(school$zillowindex, school$rank)
## [1] -0.3305684

3) summary statistics: zillowIndex and school rank group by states

describe <- describeBy(school$zillowindex, school$state, mat=TRUE)
describe_rank <- describeBy(school$rank, school$state, mat=TRUE)

p1 <- ggplot(describe, aes(x=group1, y=mean/1000)) + 
  geom_bar(stat="identity", fill="#003399") +
  geom_text(aes(label=n), vjust = -1) +
  xlab('State') +
  ylim(0, 750) +
  ylab('average zillow index in thousand') +
  ggtitle('Average zillow index in states') +
  theme_bw()

p2 <- ggplot(describe_rank, aes(x=group1, y=mean)) + 
  geom_bar(stat="identity", fill="#003399") +
  geom_text(aes(label=round(mean)), vjust = -1, size = 3) +
  xlab('State') +
  ylim(0, 1650) +
  ylab('average school rank') +
  ggtitle('Average school rank in states') +
  theme_bw()

grid.arrange(p1, p2, nrow=2)

Correlation for each state.

school %>% 
  group_by(state) %>% 
  summarize(count= n(), cor = cor(rank, zillowindex)) %>% 
  arrange(cor)
## Source: local data frame [34 x 3]
## 
##    state count         cor
## 1     NE     3 -0.91101025
## 2     VT     7 -0.88602865
## 3     AR    11 -0.80913707
## 4     HI     4 -0.69531463
## 5     NC    32 -0.67183477
## 6     AL     9 -0.61729479
## 7     OR    23 -0.58814507
## 8     MD    41 -0.58771063
## 9     MN    26 -0.58192942
## 10    VA    35 -0.57194138
## 11    WA    47 -0.56875469
## 12    CA   399 -0.51006703
## 13    TN    16 -0.47679506
## 14    MI    46 -0.45735161
## 15    MA    60 -0.45327238
## 16    PA    47 -0.44240882
## 17    OH   105 -0.42968357
## 18    IL    70 -0.42124402
## 19    GA    47 -0.40742061
## 20    NJ    36 -0.38357194
## 21    CT    37 -0.34897600
## 22    NV     7 -0.26047620
## 23    NY   110 -0.24423338
## 24    CO    40 -0.22573579
## 25    NH    13 -0.20772438
## 26    FL    80 -0.12407612
## 27    AZ    42 -0.09834842
## 28    SC    11 -0.07792724
## 29    RI     5 -0.06903855
## 30    WI    25 -0.04376106
## 31    WV     5  0.03251108
## 32    KY    10  0.11159924
## 33    OK    11  0.35832774
## 34    IA     9  0.37455415

Conclusions

Across the country, there is weak coorrelation between rank of school and average house value (zillow house value index) in surrounding area. By state, the correlation coefficients were highly different. Most of states show negative correlation between school and average house value.

Discussion

Although there is trend that the average house value is higher when rank of school is better, it’s not strong correlated. Other factors, such as local economy and security of the neighborhood, also play important roles on house value. This analysis was only based on the top 10% of total public high schools (1469 schools in 34 states out of around 20,000 public high schools in the USA). The results may not represent the overall population.

New features

  1. ggmap

  2. psych: describe(), describeBy()

Challenges

  1. Data Acquisition
  1. Data analysis

Appendix

Appendix 1. Plot of zillowIndex and rank in Arkansas (cor = -0.80913707)

school %>% filter(state == 'AR') %>% 
    ggplot(aes(x=rank, y=zillowindex/1000)) + 
    geom_point(color= "#0066CC", alpha = 0.6) + 
    stat_smooth(color="#990066", size = 1, method = "lm", se=FALSE) +
    xlab('school rank') +
    ylab("zillowIndex in thousands") +
    ggtitle('zillowIndex ~ school rank in Arkansas') +
    theme_bw()

Appendix 2. Bin the zillowIndex into three catogeries: low (below Q1), medium (Q1 to Q3) and high (above Q3).

school$valuelevel <- cut(school$zillowindex, breaks = c(0, 183200, 495100, 3069000), 
                         labels=c("Low","Medium","High"), include.lowest=TRUE)
ggplot(school, aes(x=rank, y=zillowindex/10000)) + 
    geom_point(alpha = 0.6) + 
    stat_smooth(method = "lm") +
    facet_wrap(~valuelevel, scales = "free_y") +
    xlab('school rank') +
    ylab('zillowIndex in thousands') 

Medium and high zillowIndex tend to have negative correlation, while low zillowIndex didn’t show much correlation.

ggplot(school, aes(x=valuelevel, y=rank)) +
    geom_boxplot() +
    xlab('zillowIndex') +
    ylab('school rank')

Correlation for each house value level.

school %>% 
  group_by(valuelevel) %>% 
  summarize(count= n(), cor = cor(rank, zillowindex)) 
## Source: local data frame [3 x 3]
## 
##   valuelevel count         cor
## 1        Low   368  0.08310833
## 2     Medium   734 -0.16234948
## 3       High   367 -0.21903071

No strong correlation for each group.